library(ggplot2)
  library(reshape2)
  library(dplyr)
  library(tidyr)
  library(GGally)
  library(grid)
  "%&%" = function(a,b) paste(a,b,sep="")
  source('/Volumes/im-lab/nas40t2/hwheeler/PrediXcan-Paper/scripts/Heather/make-figures/multiplot.R')
  my.dir <- '/Volumes/im-lab/nas40t2/hwheeler/cross-tissue/'
  fig.dir <- '~/GitHub/GenArch/GenArchPaper/Figures/'
  exp.dir <- my.dir %&% "gtex-rnaseq/ind-tissues-RPKM/"
  h2.dir <- '/Volumes/im-lab/nas40t2/Data/Annotations/heritability/'

FigSx DGN h2 v. exp level

se <- function(x) sqrt(var(x)/length(x))
h2 <- read.table('/Volumes/im-lab/nas40t2/hwheeler/PrediXcan_CV/cis.v.trans.prediction/DGN-WB.localGRM.h2.exp.2014-08-30.txt',header=T)
rawcounts <- read.table('/Volumes/im-lab/nas40t2/Data/Transcriptome/WB1K/data_used_for_eqtl_study/raw_counts.txt',header=T)
expmean<-data.frame(mean.exp=colMeans(rawcounts),se.exp=apply(rawcounts,2,se)) %>%mutate(gene=as.factor(colnames(rawcounts)))
all <- inner_join(expmean,h2,by='gene',copy=TRUE)
ngenes<-dim(all)[[1]]
a<-ggplot(all,aes(x=log10(mean.exp),y=h2)) + geom_point() + xlab(expression(paste("log"[10],"(mean expression raw counts)"))) + ylab(expression("Local h"^2)) + ggtitle("DGN-WB (" %&% ngenes %&% " genes)") + theme_bw(20)+ geom_smooth(method = "lm")
a

summary(lm(h2~mean.exp,all))
## 
## Call:
## lm(formula = h2 ~ mean.exp, data = all)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.12348 -0.10470 -0.06199  0.04329  0.81936 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.235e-01  1.380e-03  89.488  < 2e-16 ***
## mean.exp    -3.825e-07  1.085e-07  -3.525 0.000425 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1558 on 13623 degrees of freedom
## Multiple R-squared:  0.0009114,  Adjusted R-squared:  0.000838 
## F-statistic: 12.43 on 1 and 13623 DF,  p-value: 0.0004246
b<-ggplot(all,aes(x=log10(se.exp),y=h2)) + geom_point() + xlab(expression(paste("log"[10],"(SE expression raw counts)"))) + ylab(expression("Local h"^2)) + ggtitle("DGN-WB (" %&% ngenes %&% " genes)") + theme_bw(20)+ geom_smooth(method = "lm")
b

summary(lm(h2~se.exp,all))
## 
## Call:
## lm(formula = h2 ~ se.exp, data = all)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.12283 -0.10476 -0.06223  0.04315  0.82021 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.228e-01  1.355e-03  90.641   <2e-16 ***
## se.exp      -1.017e-05  4.057e-06  -2.508   0.0122 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1559 on 13623 degrees of freedom
## Multiple R-squared:  0.0004615,  Adjusted R-squared:  0.0003881 
## F-statistic: 6.289 on 1 and 13623 DF,  p-value: 0.01216
tiff(filename=fig.dir %&% "FigSx-h2_v_exp_rawcounts.tiff",width=960,height=480)
multiplot(a,b,cols=2)
dev.off()
## quartz_off_screen 
##                 2
png(filename=fig.dir %&% "FigSx-h2_v_exp_rawcounts.png",width=960,height=480)
multiplot(a,b,cols=2)
dev.off()
## quartz_off_screen 
##                 2
###only include genes with h2>0.1
filall<-filter(all,h2>0.1)
ngenes<-dim(filall)[[1]]
a<-ggplot(filall,aes(x=log10(mean.exp),y=h2)) + geom_point() + xlab(expression(paste("log"[10],"(mean expression raw counts)"))) + ylab(expression("Local h"^2)) + ggtitle("DGN-WB (" %&% ngenes %&% " h2 > 0.1 genes)") + theme_bw(20)+ geom_smooth(method = "lm")
a

summary(lm(h2~mean.exp,filall))
## 
## Call:
## lm(formula = h2 ~ mean.exp, data = filall)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.17465 -0.12825 -0.05403  0.08148  0.66783 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.748e-01  2.409e-03 114.102   <2e-16 ***
## mean.exp    -1.705e-07  2.345e-07  -0.727    0.467    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1652 on 5055 degrees of freedom
## Multiple R-squared:  0.0001046,  Adjusted R-squared:  -9.321e-05 
## F-statistic: 0.5288 on 1 and 5055 DF,  p-value: 0.4671
b<-ggplot(filall,aes(x=log10(se.exp),y=h2)) + geom_point() + xlab(expression(paste("log"[10],"(SE expression raw counts)"))) + ylab(expression("Local h"^2)) + ggtitle("DGN-WB (" %&% ngenes %&% " h2 > 0.1 genes)") + theme_bw(20)+ geom_smooth(method = "lm")
b

summary(lm(h2~se.exp,filall))
## 
## Call:
## lm(formula = h2 ~ se.exp, data = filall)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.17470 -0.12809 -0.05404  0.08182  0.66813 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.743e-01  2.396e-03 114.511   <2e-16 ***
## se.exp      6.629e-07  1.259e-05   0.053    0.958    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1652 on 5055 degrees of freedom
## Multiple R-squared:  5.487e-07,  Adjusted R-squared:  -0.0001973 
## F-statistic: 0.002774 on 1 and 5055 DF,  p-value: 0.958

we do not see h2 dependence on expression level in contrast to what is reported for array based h2 estimates–see Wright et al Table 2.

TO DO: plot a) array based h2 vs. RPKM in gtex, use alkes and or fred wright’s h2 estimates b) RNAseq h2 vs RPKM in gtex

Price h2 v GTEx RPKM

##get GTEx TW local h2
twh2 <- read.table(my.dir %&% "GTEx_Tissue-Wide_local_h2_se_geneinfo_no_description.txt",header=TRUE) 
twh2 <- dplyr::rename(twh2,gene=AssociatedGeneName,ensid=EnsemblGeneID)

##get price h2 (custom-made human array containing 23,720 unique oligonucleotide probes, http://www.nature.com/nature/journal/v452/n7186/full/nature06758.html)
price <- read.table(h2.dir %&% "Alkes/h2all.txt",header=TRUE) %>% dplyr::rename(gene=gname)
head(price)
##      gene h2adip h2blood h2adipcis h2adiptra h2bloodcis h2bloodtra
## 1   A2BP1  0.361   0.038     0.151     0.235      0.210      0.376
## 2     A2M  0.193   0.228     0.199    -0.006      0.005      0.223
## 3   A2ML1  0.125   0.130     0.003     0.122      0.067      0.063
## 4 A3GALT2  0.231   0.058     0.225     0.006     -0.101      0.159
## 5  A4GALT  0.176   0.322    -0.275     0.451      0.364     -0.042
## 6   A4GNT  0.629   0.265    -0.039     0.668      0.051      0.214
summary(price)
##       gene           h2adip           h2blood          h2adipcis       
##  A2BP1  :    1   Min.   :-0.2470   Min.   :-0.2550   Min.   :-0.42200  
##  A2M    :    1   1st Qu.: 0.1370   1st Qu.: 0.0510   1st Qu.:-0.03100  
##  A2ML1  :    1   Median : 0.2345   Median : 0.1400   Median : 0.06100  
##  A3GALT2:    1   Mean   : 0.2511   Mean   : 0.1679   Mean   : 0.07464  
##  A4GALT :    1   3rd Qu.: 0.3480   3rd Qu.: 0.2560   3rd Qu.: 0.16500  
##  A4GNT  :    1   Max.   : 0.9640   Max.   : 0.9030   Max.   : 0.89100  
##  (Other):15072                     NA's   :253                         
##    h2adiptra         h2bloodcis         h2bloodtra     
##  Min.   :-0.4570   Min.   :-0.39200   Min.   :-0.4480  
##  1st Qu.: 0.0730   1st Qu.:-0.02500   1st Qu.: 0.0030  
##  Median : 0.1940   Median : 0.05600   Median : 0.1080  
##  Mean   : 0.1974   Mean   : 0.07145   Mean   : 0.1128  
##  3rd Qu.: 0.3170   3rd Qu.: 0.14900   3rd Qu.: 0.2170  
##  Max.   : 1.0250   Max.   : 0.69000   Max.   : 0.8090  
##                    NA's   :253        NA's   :253
a<-inner_join(twh2,price,"gene")
dim(a)
## [1] 11360    28
ggplot(a,aes(h2.WholeBlood,h2bloodcis)) + geom_point(alpha=1/4) + geom_smooth() + theme_bw()

cor.test(a$h2.WholeBlood,a$h2bloodcis,method='s')
## 
##  Spearman's rank correlation rho
## 
## data:  a$h2.WholeBlood and a$h2bloodcis
## S = 2.0757e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.1077962
cor.test(a$h2.WholeBlood,a$h2blood,method='s')
## 
##  Spearman's rank correlation rho
## 
## data:  a$h2.WholeBlood and a$h2blood
## S = 2.098e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.09823263
cor.test(a$h2.WholeBlood,a$h2bloodtra,method='s')
## 
##  Spearman's rank correlation rho
## 
## data:  a$h2.WholeBlood and a$h2bloodtra
## S = 2.3193e+11, p-value = 0.7432
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## 0.003099754
tislist <- scan(my.dir %&% "nine.spaces.tissue.list","c",sep="\n")
rpkm <- data.frame()
for(tis in tislist){
  tisrpkm <- read.table(exp.dir %&% tis %&% ".meanRPKM.txt",header=TRUE)  
  combo <- inner_join(a,tisrpkm,by='ensid') %>% dplyr::select(ensid,meanRPKM,h2adipcis,h2bloodcis) %>% mutate(tissue=tis)
  res<-cor.test(combo$h2bloodcis,log10(combo$meanRPKM),method='p')
  cat(tis," RPKM v Price h2bloodcis: R=",signif(res$estimate,2)," p=",signif(res$p.value,2),"\n",sep="")
  res<-cor.test(combo$h2adipcis,log10(combo$meanRPKM),method='p')
  cat(tis," RPKM v Price h2adipcis: R=",signif(res$estimate,2)," p=",signif(res$p.value,2),"\n",sep="")
  rpkm <- rbind(rpkm,combo)
}
## Adipose - Subcutaneous RPKM v Price h2bloodcis: R=0.11 p=0
## Adipose - Subcutaneous RPKM v Price h2adipcis: R=0.12 p=0
## Artery - Tibial RPKM v Price h2bloodcis: R=0.096 p=0
## Artery - Tibial RPKM v Price h2adipcis: R=0.12 p=0
## Heart - Left Ventricle RPKM v Price h2bloodcis: R=0.095 p=0
## Heart - Left Ventricle RPKM v Price h2adipcis: R=0.12 p=0
## Lung RPKM v Price h2bloodcis: R=0.12 p=0
## Lung RPKM v Price h2adipcis: R=0.1 p=0
## Muscle - Skeletal RPKM v Price h2bloodcis: R=0.081 p=0
## Muscle - Skeletal RPKM v Price h2adipcis: R=0.11 p=0
## Nerve - Tibial RPKM v Price h2bloodcis: R=0.1 p=0
## Nerve - Tibial RPKM v Price h2adipcis: R=0.12 p=0
## Skin - Sun Exposed (Lower leg) RPKM v Price h2bloodcis: R=0.078 p=0
## Skin - Sun Exposed (Lower leg) RPKM v Price h2adipcis: R=0.083 p=0
## Thyroid RPKM v Price h2bloodcis: R=0.11 p=0
## Thyroid RPKM v Price h2adipcis: R=0.12 p=0
## Whole Blood RPKM v Price h2bloodcis: R=0.14 p=0
## Whole Blood RPKM v Price h2adipcis: R=0.068 p=3.7e-13
ggplot(rpkm,aes(x=log10(meanRPKM),y=h2bloodcis))+geom_point(alpha=1/4)+geom_smooth(method="lm")+facet_wrap(~tissue)+ggtitle("Price blood h2 vs. GTEx mean RPKM")+theme_bw()

ggplot(rpkm,aes(x=log10(meanRPKM),y=h2adipcis))+geom_point(alpha=1/4)+geom_smooth(method="lm")+facet_wrap(~tissue)+ggtitle("Price adipose h2 vs. GTEx mean RPKM")+theme_bw()

##price h2 vs. DGN raw counts
dgnprice <- inner_join(all,price,by='gene')
ggplot(dgnprice,aes(x=log10(mean.exp),y=h2bloodcis))+ geom_point(alpha=1/2) + geom_smooth() + theme_bw()+ggtitle("Price blood h2 vs. DGN mean counts")

cor.test(log10(dgnprice$mean.exp),dgnprice$h2bloodcis,method='s')
## 
##  Spearman's rank correlation rho
## 
## data:  log10(dgnprice$mean.exp) and dgnprice$h2bloodcis
## S = 1.3242e+11, p-value = 0.2269
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.01253105
ggplot(dgnprice,aes(x=log10(mean.exp),y=h2adipcis))+ geom_point(alpha=1/2) + geom_smooth() + theme_bw()+ggtitle("Price adipose h2 vs. DGN mean counts")

cor.test(log10(dgnprice$mean.exp),dgnprice$h2adipcis,method='s')
## 
##  Spearman's rank correlation rho
## 
## data:  log10(dgnprice$mean.exp) and dgnprice$h2adipcis
## S = 1.3912e+11, p-value = 0.5721
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.00581726

Wright h2 v GTEx RPKM

##get GTEx TW local h2
twh2 <- read.table(my.dir %&% "GTEx_Tissue-Wide_local_h2_se_geneinfo_no_description.txt",header=TRUE) 
twh2 <- dplyr::rename(twh2,gene=AssociatedGeneName,ensid=EnsemblGeneID)

wright <- read.table(h2.dir %&% "Fred/h2PB.txt",header=TRUE) %>% dplyr::rename(gene=gname)
head(wright)
##      gene        h2PB
## 1    A1BG -0.25582491
## 2    A1CF  0.05112739
## 3     A2M  0.49116590
## 4   A2ML1  0.10640690
## 5 A3GALT2  0.20743214
## 6  A4GALT  0.03834043
summary(wright)
##       gene            h2PB         
##  A1BG   :    1   Min.   :-0.40792  
##  A1CF   :    1   1st Qu.: 0.01551  
##  A2M    :    1   Median : 0.10140  
##  A2ML1  :    1   Mean   : 0.10510  
##  A3GALT2:    1   3rd Qu.: 0.18750  
##  A4GALT :    1   Max.   : 0.90467  
##  (Other):17786
a<-inner_join(twh2,wright,"gene")
dim(a)
## [1] 15805    23
ggplot(a,aes(h2.WholeBlood,h2PB)) + geom_point(alpha=1/4) + geom_smooth() + theme_bw()

cor.test(a$h2.WholeBlood,a$h2PB,method='s')
## 
##  Spearman's rank correlation rho
## 
## data:  a$h2.WholeBlood and a$h2PB
## S = 6.2666e+11, p-value = 2.055e-09
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##        rho 
## 0.04764976
tislist <- scan(my.dir %&% "nine.spaces.tissue.list","c",sep="\n")
rpkm <- data.frame()
for(tis in tislist){
  tisrpkm <- read.table(exp.dir %&% tis %&% ".meanRPKM.txt",header=TRUE)  
  combo <- inner_join(a,tisrpkm,by='ensid') %>% dplyr::select(ensid,meanRPKM,h2PB) %>% mutate(tissue=tis)
  res<-cor.test(combo$h2PB,log10(combo$meanRPKM),method='p')
  cat(tis," RPKM v Wright h2PB: R=",signif(res$estimate,2)," p=",signif(res$p.value,2),"\n",sep="")
  rpkm <- rbind(rpkm,combo)
}
## Adipose - Subcutaneous RPKM v Wright h2PB: R=0.19 p=0
## Artery - Tibial RPKM v Wright h2PB: R=0.18 p=0
## Heart - Left Ventricle RPKM v Wright h2PB: R=0.17 p=0
## Lung RPKM v Wright h2PB: R=0.23 p=0
## Muscle - Skeletal RPKM v Wright h2PB: R=0.15 p=0
## Nerve - Tibial RPKM v Wright h2PB: R=0.18 p=0
## Skin - Sun Exposed (Lower leg) RPKM v Wright h2PB: R=0.15 p=0
## Thyroid RPKM v Wright h2PB: R=0.18 p=0
## Whole Blood RPKM v Wright h2PB: R=0.29 p=0
ggplot(rpkm,aes(x=log10(meanRPKM),y=h2PB))+geom_point(alpha=1/4)+geom_smooth(method="lm")+facet_wrap(~tissue)+ggtitle("Wright blood h2 vs. GTEx mean RPKM")+theme_bw()

##wright h2 vs. DGN raw counts
dgnwright <- inner_join(all,wright,by='gene')
ggplot(dgnwright,aes(x=log10(mean.exp),y=h2PB))+ geom_point(alpha=1/2) + geom_smooth() + theme_bw()+ggtitle("Wright blood h2 vs. DGN mean counts")

cor.test(log10(dgnwright$mean.exp),dgnwright$h2PB,method='s')
## 
##  Spearman's rank correlation rho
## 
## data:  log10(dgnwright$mean.exp) and dgnwright$h2PB
## S = 2.3087e+11, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.2651532

GTEx h2 v GTEx RPKM

tislist <- scan(my.dir %&% 'nine.tissue.list',sep="\n",what="character")
tislist2 <- scan(my.dir %&% "nine.spaces.tissue.list","c",sep="\n")
tw <- data.frame()
for(i in 1:length(tislist)){
  tisrpkm <- read.table(exp.dir %&% tislist2[i] %&% ".meanRPKM.txt",header=TRUE)
  h2 <- read.table(my.dir %&% "gtex-h2-estimates/GTEx.tissue-wide.h2_" %&% tislist[i] %&% "_marginal.local_2015-03-24.txt",header=T, sep="\t") %>% select(tissue,ensid,h2)
  subdata <- inner_join(h2,tisrpkm,by="ensid")
  res<-cor.test(log10(subdata$meanRPKM),subdata$h2)
  cat(tislist[i],"\tPearson R=",round(res$estimate,3),"\tP-value=",res$p.value,"\n")
  tw <- rbind(tw,subdata)
}
## Adipose-Subcutaneous     Pearson R= 0.072    P-value= 0 
## Artery-Tibial    Pearson R= 0.06     P-value= 4.440892e-16 
## Heart-LeftVentricle  Pearson R= 0.048    P-value= 6.506884e-11 
## Lung     Pearson R= 0    P-value= 0.9709712 
## Muscle-Skeletal  Pearson R= 0.042    P-value= 9.302326e-09 
## Nerve-Tibial     Pearson R= 0.095    P-value= 0 
## Skin-SunExposed(Lowerleg)    Pearson R= 0.08     P-value= 0 
## Thyroid  Pearson R= 0.059    P-value= 4.440892e-16 
## WholeBlood   Pearson R= 0.083    P-value= 0
ggplot(tw,aes(x=log10(meanRPKM),y=h2))+geom_point(alpha=0.4)+ylab(expression("GCTA h"^2))+xlab(expression(paste("log"[10],"(meanRPKM)")))+geom_smooth(method="lm") + facet_wrap(~tissue,ncol=3)+theme_bw()+ggtitle("GTEx TW local h2 vs. GTEx mean RPKM")